PilotD2Redo - QC and exploratory analysis¶
In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import os
import yaml
import scanpy as sc
import rapids_singlecell as rsc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.autolayout'] = True
# Increase all font sizes
plt.rcParams['font.size'] = 16 # Base font size
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['xtick.labelsize'] = 15
plt.rcParams['ytick.labelsize'] = 15
plt.rcParams['legend.fontsize'] = 15
from preprocess import _convert_oak_path
import qc_plots
/home/emmadann/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Configuration
In [3]:
# Read config
experiment_name = 'PilotD2Redo_Lane2'
config_file = '../../metadata/experiments_config.yaml'
with open(config_file, 'r') as f:
config = yaml.safe_load(f)
config = config[experiment_name]
datadir = _convert_oak_path(config['datadir'])
sample_metadata_csv = _convert_oak_path(config['sample_metadata'])
PLOTDIR = f'../../results/{experiment_name}/'
sc.settings.figdir = PLOTDIR
def save_plot(pl_name, plot_dir = None):
if plot_dir is None:
plot_dir = PLOTDIR
plt.savefig(f'{plot_dir}/{pl_name}.pdf');
plt.savefig(f'{plot_dir}/{pl_name}.png');
Read merged dataset¶
In [4]:
sample_metadata = pd.read_csv(sample_metadata_csv, index_col=0)
sgrna_library_metadata = pd.read_csv('../../metadata/sgRNA_library_curated.csv', index_col=0)
sample_metadata
Out[4]:
| experiment_id | cell_sample_id | donor_id | culture_condition | library_id | library_prep_kit | probe_hyb_loading | GEM_loading | |
|---|---|---|---|---|---|---|---|---|
| 0 | PilotD2Redo_Lane2 | PilotD2Redo_Day7 | CE0010216 | Day7 | PilotD2Redo_Lane2_Day7 | GEMX_flex_v2 | 1M cells, 10 uL probe | 3x |
| 1 | PilotD2Redo_Lane2 | PilotD2Redo_Rest | CE0010216 | Rest | PilotD2Redo_Lane2_Rest | GEMX_flex_v2 | 1M cells, 10 uL probe | 3x |
| 2 | PilotD2Redo_Lane2 | PilotD2Redo_Restim6hr | CE0010216 | Restim6hr | PilotD2Redo_Lane2_Restim6hr | GEMX_flex_v2 | 1M cells, 10 uL probe | 3x |
| 3 | PilotD2Redo_Lane2 | PilotD2Redo_Restim24hr | CE0010216 | Restim24hr | PilotD2Redo_Lane2_Restim24hr | GEMX_flex_v2 | 1M cells, 10 uL probe | 3x |
In [43]:
adata = sc.read_h5ad(f'{datadir}/{experiment_name}_merged.gex.lognorm.h5ad', backed=False)
adata.var = adata.var[['gene_ids', 'gene_name', 'mt']].copy()
adata.var_names = adata.var[ 'gene_name'].values
In [44]:
adata
Out[44]:
AnnData object with n_obs × n_vars = 668085 × 18129
obs: 'cell_sample_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_genes', 'batch', 'Unnamed: 0', 'experiment_id', 'donor_id', 'culture_condition', 'library_id', 'library_prep_kit', 'probe_hyb_loading', 'GEM_loading'
var: 'gene_ids', 'gene_name', 'mt'
uns: 'hvg', 'log1p', 'pca'
obsm: 'X_pca'
varm: 'PCs'
In [45]:
if 'guide_id' not in adata.obs:
# Load assignment of sgRNAs to cells
sgrna_assignment = pd.read_csv(f'{datadir}/sgrna_assignment.csv', index_col=0)
sgrna_assignment_index = sgrna_assignment.index.tolist()
sgrna_assignment = pd.merge(sgrna_assignment, sgrna_library_metadata.rename({"sgrna_id":'guide_id'}, axis=1), how='left')
sgrna_assignment.index = sgrna_assignment_index
existing_cols = [col for col in sgrna_assignment.columns if col in adata.obs.columns]
adata.obs.drop(columns=existing_cols, inplace=True)
adata.obs = pd.concat([adata.obs, sgrna_assignment.loc[adata.obs_names]], axis=1)
adata.obs['guide_type'] = np.where(adata.obs['guide_id'].str.startswith('NTC-'), 'non-targeting', 'targeting')
QC metrics¶
In [7]:
qc_plots.plot_ncells_sample(adata);
save_plot(f'{experiment_name}_n_cells_preQC')
In [73]:
sc.pl.violin(adata, 'log1p_total_counts', groupby='cell_sample_id', rotation=90)
In [71]:
fig, axs = plt.subplots(4,1, figsize=(5,10))
for i,m in enumerate(['log1p_total_counts','total_counts', 'n_genes', 'pct_counts_mt']):
sc.pl.violin(adata, m, groupby='cell_sample_id', rotation=90, show=False, ax=axs[i]);
if i != 3:
# remove x-axis tick labels
axs[i].set_xticklabels([])
fig.tight_layout()
fig.show()
In [69]:
sc.pl.scatter(adata, 'log1p_total_counts', 'log1p_n_genes_by_counts', color='culture_condition')
sc.pl.scatter(adata, 'log1p_n_genes_by_counts', 'pct_counts_in_top_100_genes', color='culture_condition', size=3)
In [13]:
adata.obs[['total_counts', 'n_genes', 'pct_counts_mt', 'cell_sample_id']].groupby('cell_sample_id').mean()
Out[13]:
| total_counts | n_genes | pct_counts_mt | |
|---|---|---|---|
| cell_sample_id | |||
| PilotD2Redo_Day7 | 14184.199219 | 4703.165669 | 0.612662 |
| PilotD2Redo_Rest | 7462.265137 | 3313.420798 | 0.550297 |
| PilotD2Redo_Restim6hr | 7886.190430 | 3294.112534 | 0.364597 |
| PilotD2Redo_Restim24hr | 7778.331055 | 3401.161530 | 0.824780 |
Estimate fraction of low quality cells (high fraction of mitochondrial genes, low number of captured genes)
In [51]:
adata.obs['low_quality'] = (adata.obs['pct_counts_mt'] > 5) | (adata.obs['n_genes'] < 1000)
In [52]:
# plot fraction of low_quality cells for each sample_id
low_quality_df = adata.obs.groupby(['cell_sample_id', 'culture_condition'])['low_quality'].mean().reset_index(name='fraction_low_quality')
low_quality_df
sns.barplot(data=low_quality_df, x='cell_sample_id', y='fraction_low_quality', hue='culture_condition', dodge=False);
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
plt.xticks(rotation=90);
sgRNA assignment QC metrics¶
In [74]:
import upsetplot
from upsetplot import from_contents
In [75]:
all_crispr_a = {}
for sample_id in sample_metadata['cell_sample_id']:
sgrna_h5ad = f"{datadir}{sample_id}.sgRNA.h5ad"
crispr_a = sc.read_h5ad(sgrna_h5ad)
all_crispr_a[sample_id] = crispr_a
In [30]:
# Get lists of inefficient and nonspecific guides for each sample
all_inefficient = {k:x.var_names[x.var['inefficient']].tolist() for k,x in all_crispr_a.items()}
all_nonspecific = {k:x.var_names[x.var['nonspecific']].tolist() for k,x in all_crispr_a.items()}
inef = from_contents(all_inefficient)
ax_dict = upsetplot.UpSet(inef, subset_size="count").plot()
nons = from_contents(all_nonspecific)
ax_dict = upsetplot.UpSet(nons).plot()
In [82]:
pl_df = adata.obs[['guide_id', 'cell_sample_id', 'low_quality']]
pl_df['group'] = 'targeting single sgRNA'
pl_df['group'] = np.where(adata.obs['guide_id'].str.startswith('NTC'), 'NTC single sgRNA', pl_df['group'])
pl_df['group'] = np.where(adata.obs['guide_id'].isna(), 'no sgRNA (>= 3 UMIs)', pl_df['group'])
pl_df['group'] = np.where(adata.obs['guide_id'] == 'multi_sgRNA', 'multi sgRNA (>= 3 UMIs)', pl_df['group'])
pl_df = pl_df[~pl_df['low_quality']].copy()
# Plot stacked barplot of fraction of cells in each group for each sample
group_counts = pl_df.groupby(['cell_sample_id', 'group']).size().unstack()
group_fractions = group_counts.div(group_counts.sum(axis=1), axis=0)
group_fractions.plot(kind='bar', stacked=True, figsize=(4,5))
plt.xticks(rotation=45, ha='right')
plt.ylabel('Fraction of cells')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title("3x loading");
plt.tight_layout()
In [83]:
group_fractions
Out[83]:
| group | NTC single sgRNA | multi sgRNA (>= 3 UMIs) | no sgRNA (>= 3 UMIs) | targeting single sgRNA |
|---|---|---|---|---|
| cell_sample_id | ||||
| PilotD2Redo_Day7 | 0.018776 | 0.344840 | 0.134183 | 0.502200 |
| PilotD2Redo_Rest | 0.020919 | 0.284717 | 0.181568 | 0.512796 |
| PilotD2Redo_Restim6hr | 0.020682 | 0.304761 | 0.176436 | 0.498120 |
| PilotD2Redo_Restim24hr | 0.021999 | 0.268535 | 0.195860 | 0.513607 |
In [51]:
# no_guide_cells = adata.obs_names[adata.obs['guide_id'].isna()]
# multi_guide_cells = adata.obs_names[adata.obs['guide_id'] == 'multi_sgRNA']
# fig, axes = plt.subplots(1, 4, figsize=(20, 5))
# for i, (sample_id, crispr_a) in enumerate(all_crispr_a.items()):
# crispr_a.obs['multi_sgrna'] = crispr_a.obs_names.isin(multi_guide_cells)
# crispr_a.obs['no_sgrna'] = crispr_a.obs_names.isin(no_guide_cells)
# crispr_a.obs['targeting_umis'] = np.array(crispr_a[:, crispr_a.var['sgrna_type'] == 'targeting'].X.sum(axis=1)).flatten()
# crispr_a.obs['ntc_umis'] = np.array(crispr_a[:, crispr_a.var['sgrna_type'] == 'NTC'].X.sum(axis=1)).flatten()
# crispr_a.obs['inefficient_umis'] = np.array(crispr_a[:, crispr_a.var['inefficient']].X.sum(axis=1)).flatten()
# crispr_a.obs['nonspecific_umis'] = np.array(crispr_a[:, crispr_a.var['nonspecific']].X.sum(axis=1)).flatten()
# ntc_n = sum(no_sgrna_a.obs['ntc_umis'] > 3)
# tot_n = no_sgrna_a.n_obs
# subtitle = f'{ntc_n}/{tot_n} multi sgRNA cells with NTC sgRNAs'
# no_sgrna_a = crispr_a[crispr_a.obs['multi_sgrna']].copy()
# axes[i].scatter(no_sgrna_a.obs['ntc_umis'], no_sgrna_a.obs['targeting_umis'], s=3)
# axes[i].set_xscale('log')
# axes[i].set_yscale('log')
# axes[i].set_xlabel('NTC sgRNA UMI counts')
# axes[i].set_ylabel('Targeting sgRNA UMI counts')
# axes[i].set_title('Multi sgRNA cells - ' + crispr_a.obs['cell_sample_id'][0] + '\n' + subtitle)
# plt.tight_layout()
# plt.show()
Dimensionality reduction¶
In [16]:
sc.pl.pca(adata, annotate_var_explained=True, color=['culture_condition'], components=['1,2'], wspace=0.5, ncols=3, size=2)
In [17]:
sc.pl.pca(adata, annotate_var_explained=True, color=['culture_condition', 'pct_counts_mt', 'log1p_n_genes_by_counts'], components=['1,2', '3,4', '5,6'], wspace=0.5, ncols=3, size=3)
Visualize genes with top loadings for each PC
In [18]:
sc.pl.pca_loadings(adata, components='1,2,3');
sc.pl.pca_loadings(adata, components='4,5,6')
Clustering¶
In [46]:
rsc.pp.neighbors(adata, n_neighbors=50)
rsc.tl.umap(adata)
rsc.tl.louvain(adata)
In [47]:
# adata.write_h5ad()
In [64]:
adata.obs.columns
Out[64]:
Index(['cell_sample_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes',
'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
'pct_counts_in_top_500_genes', 'total_counts_mt',
'log1p_total_counts_mt', 'pct_counts_mt', 'n_genes', 'batch',
'Unnamed: 0', 'experiment_id', 'donor_id', 'culture_condition',
'library_id', 'library_prep_kit', 'probe_hyb_loading', 'GEM_loading',
'guide_id', 'top_guide_umi_counts', 'sequence', 'perturbed_gene_name',
'perturbed_gene_id', 'guide_type', 'louvain', 'low_quality'],
dtype='object')
In [65]:
sc.pl.umap(adata, color=['cell_sample_id'] + ['log1p_total_counts','log1p_n_genes_by_counts','pct_counts_mt', 'pct_counts_in_top_100_genes'], wspace=0.4, size=3, ncols=2)
In [53]:
sc.pl.umap(adata, color=['cell_sample_id'] + ['low_quality'], wspace=0.4, size=3)
In [50]:
qc_plots.plot_markers_sample_dotplot(adata, save=f'{experiment_name}_markers_sample_dotplot')
WARNING: saving figure to file ../../results/PilotD2Redo_Lane2/dotplot_PilotD2Redo_Lane2_markers_sample_dotplot.pdf
In [54]:
qc_plots.plot_markers_umap(adata, wspace=0.1, size=3, cmap='magma', ncols=5, save=f'{experiment_name}_markers.png')
WARNING: saving figure to file ../../results/PilotD2Redo_Lane2/umapPilotD2Redo_Lane2_markers.png